check spelling

falvour

#load dependencies
require(png)        #for loading images
require(plyr)       
require(dplyr)      #for dataframe manipulation
require(osmdata)    #to import data from open street map
require(Hmisc)      
require(usedist)    #For computing distances
require(leaflet)    #interactive plotting of geospatial data
require(sf)         #handling geospatial data
require(geosphere)  #computing on geospatial data

require(plotly)     #interactive plotting
require(knitr)      #markdown utilities
require(caret)      #machine learning
require(xgboost)    #machine learning


require(rgdal)
require(RColorBrewer)
require(htmltools)

#Other packages for scripts run outside this document
#require(doParallel) #parallelisation
#require(foreach)    #parallelisation
#require(gplots)     #plotting heatmap

Synopsis

Introduction

The data have been downloaded from the online property website domain.com.au by a member of the online data science community kaggle.com, you can see the post here. Thank you to user anthonypino.

The primary objectives of this analysis are threefold:

  • Prediction: It would be useful to be able to predict the sale price of a property

  • Description: Understanding what makes a valuable property in Melbourne is useful knowledge for any home owner, it also makes for an interesting analysis.

  • Model Assessment: We’re going to fit a variety of model, their accuracy will be compared, along with any strengths/weaknesses specific to this dataset.

See the code by hitting the buttons

Data Preparation

Quality Check

#----read in data
property_data <- read.csv('original_data/Melbourne_housing_FULL.csv')

#----rename columns
names(property_data) <- c('suburb','add','nrooms','type','price','method','agent','date',
              'precomputeddist','pc','nbedroom','nbathroom','ncar','land_area',
              'building_area','year_built','council_area','lat','lng','region',
              'propcount')

#----recast types
property_data$date = as.Date(property_data$date,format='%d/%m/%Y')
property_data$suburb = property_data$suburb %>% 
    as.character %>% capitalize %>% as.factor
property_data$add = as.character(property_data$add)
property_data$propcount = as.numeric(property_data$propcount)
property_data$precomputeddist = as.numeric(property_data$precomputeddist)

#----compute distance from city
MELBOURNE_CENTRE = c(144.9631,-37.8136)
locs = select(property_data,c(lng,lat))
property_data$dist_cbd = apply(locs,1,function(loc){distm(loc,MELBOURNE_CENTRE)})

#give every object an ID
property_data$ID = as.character(1:nrow(property_data))

After parsing and an ID column is appended to the data, the table has 34857 records of sales, each with 23 features. View the first 100 rows of the data here.

discuss the data in its default form

write here download the data yourself here

#parse government data
gov_median_data <- read.csv('other_data/suburb_house_2017_edited.csv',stringsAsFactors = F)
names(gov_median_data)[1] <- 'suburb'
gov_median_data$suburb <- gov_median_data$suburb %>% tolower %>% capitalize

#limit property_data to 2017
property_data_2017 <- subset(property_data,date<'2018-01-01' & date > '2016-12-31')
#limit to houses
property_data_2017 = property_data_2017[property_data_2017$type=='h',]


#select relevant columns
gov_median_data =  gov_median_data %>% select(c(suburb,X2017))
#rename column
names(gov_median_data)[2] <- 'median_price'
property_data_2017 <- group_by(property_data_2017,suburb) %>% 
    summarise(median_price = median(price,na.rm=T),count=n())

#record how many properties were sold in this area, compared to total
property_data_2017$prop <- property_data_2017$count/sum(property_data_2017$count)

comparison_data = merge(property_data_2017,gov_median_data,by='suburb',suffixes = c('_pd','_gov'))


plot_ly(data=comparison_data,
        x=~median_price_gov,
        y=~median_price_pd,
        type='scatter',
        mode='markers',
        text=paste0(comparison_data$suburb,'\n',comparison_data$count,' sales'),
        hoverinfo='text'
) %>% add_lines(x=c(0,4e+6),y=c(0,4e+6),inherit = F) %>%
    layout(showlegend=F,
           xaxis=list('title'='Median Sale Price in our Dataset'),
           yaxis=list('title'='Median'))

skip alot of this, or collapse not bold

Exploration and Cleaning

NA values

fig <- readPNG('na_heatmap.png')
plot.new()
rasterImage(fig,0,0,1,1)

Geospatial Data

#------------------------read suburb data---------------------------------------

#read in suburb boundaries
locs <-  readOGR('other_data/VIC_LOCALITY_POLYGON_shp.shp',
                 GDAL1_integer64_policy = TRUE,stringsAsFactors = F,verbose=F)
#locs$VIC_LOCA_2 has suburb names, parse them to be like analysis data
locs$VIC_LOCA_2 <- capitalize(tolower(locs$VIC_LOCA_2))


#--------------------aggregate data used in analysis----------------------------
suburb_data <- group_by(property_data,suburb) %>% summarise(
    median_price = median(price,na.rm=T),
    q25 = quantile(price,0.25,na.rm=T),
    q75 = quantile(price,0.75,na.rm=T),
    mean_lng = mean(lng,na.rm=T),
    mean_lat = mean(lat,na.rm=T),
    count0 = n(),
    count = sum(!is.na(price))
    )

#---------------------------combine suburb and analysis data--------------------
#find suburbs in both datasets
locs_subset <- locs[which(locs$VIC_LOCA_2 %in% levels(suburb_data$suburb)),]

#merge data from suburb_data to locs_subset
merged <- merge(locs_subset,suburb_data,by.x='VIC_LOCA_2',by.y='suburb')

#----------------------------setup interactive text-----------------------------
prettify <- function(number) {
    format(round(number,0),big.mar=',',scientific = FALSE)
}
merged$string <- paste0(
    merged$VIC_LOCA_2,
    '<br>median price: $',prettify(merged$median_price),
    '<br>25th%: $',prettify(merged$q25),
    '<br>75th%: $',prettify(merged$q75),
    '<br>count: ',merged$count
    )

#----------------------------setup colours--------------------------------------
merged$median_perc <- percent_rank(merged$median_price)
pal <- colorNumeric('Blues',domain=range(merged$median_perc,na.rm=T))

#--------------------------------plot-------------------------------------------
lngview <- mean(merged$mean_lng,na.rm=T)
latview <- mean(merged$mean_lat,na.rm=T)
leaflet(merged) %>% addTiles() %>% 
    setView(lat=latview,lng =lngview , zoom=9) %>% 
    addPolygons(fillColor = ~pal(median_perc),
                fillOpacity = 1,
                weight=1,
                label=~lapply(string,HTML)
                )
#----------------------lnglat
leaflet(property_data) %>% addTiles %>% addCircles(lng=~lng,lat=~lat,
                                                   opacity = 0.5,
                                                   fillOpacity = 0.5,
                                                   color='#0078D7',
                                                   fillColor='#0078D7'
                                                   )
#lng and lat have very few missing, dont wanna bother imputing
loc_bool <- !is.na(property_data$lng) & !(is.na(property_data$lat))

Univariate EDA

#----------------------year_built
#1 property was apparaently built in 1196 and another next century
to_correct = which(property_data$year_built > 2030| property_data$year_built < 1788) 
property_data[to_correct,'year_built'] = NA
year_built_bool <- !is.na(property_data$year_built)
#--------------------ncar
ggplot(data= property_data, aes(x=ncar,y=log(price))) + 
    geom_jitter(alpha=0.4,color='#0078D7')

#limit the scope of analysis to properties with fewer than 7 car parks
ncar_bool <- !is.na(property_data$ncar) & property_data$ncar<=6
#--------------------nbathroom
ggplot(property_data,aes(x=nbathroom,y=log(price))) + 
    geom_jitter(alpha=0.4,color='#0078D7')

#limit the analysis to properties with nbathroom <= 4
bath_bool <- !is.na(property_data$nbathroom) & (property_data$nbathroom<=4 & property_data$nbathroom>0)
#---------------------nrooms
ggplot(property_data,aes(x=nrooms,y=log(price))) + 
    geom_jitter(alpha=0.4,color='#0078D7')

#limit the analysis to properties with nrooms <=6
nroom_bool <- property_data$nrooms<=6 & property_data$nrooms>0
#---------------------nbedroom
sum(is.na(property_data$nbedroom))
## [1] 8217
has_bedroom = !is.na(property_data$nbedroom)
cor(property_data$nbedroom[has_bedroom],property_data$nrooms[has_bedroom])
## [1] 0.9467546
#dont include in analysis
#----------------------building_area
ggplot(property_data,aes(x=building_area,y=log(price))) + 
    geom_point(alpha=0.4,color='#0078D7')

#make ==0 NA
property_data[is.na(property_data$building_area) | property_data$building_area==0,'building_area'] = NA


#limit analysis to buildings with less than 1000 building_area
BA_bool <- !is.na(property_data$building_area) & property_data$building_area<1000
#----------------------land_area
ggplot(property_data,aes(x=land_area,y=log(price))) + 
    geom_point(alpha=0.4,color='#0078D7')

#limit analysis to properties with land_area < 10000
land_bool <- !is.na(property_data$land_area) & property_data$land_area < 10000
#---------------------actions

#gather bools of which rows can be kept, intersection will be kept
property_data <- property_data[nroom_bool & bath_bool & land_bool & BA_bool & ncar_bool &loc_bool & year_built_bool,]

#replot
ggplot(property_data,aes(x=building_area,y=log(price))) + 
    geom_point(alpha=0.4,color='#0078D7')

ggplot(property_data,aes(x=land_area,y=log(price))) + 
    geom_point(alpha=0.4,color='#0078D7')

Validation

The validation scheme is as follows:

This scheme has two main benefits over other popular methods:

  • It is fair. explain this

  • It used all of the training/ensemble validation data for the final model. explain this

Validation Scheme Diagram

Validation Scheme Diagram

#set seed for reproducibility
set.seed(8236)

#remove data with missing price
no_pricing <- is.na(property_data$price)
property_data <- property_data[!is.na(property_data$price),]

#split data into train/ensemble-validation and test
train_ensbl_Index <- createDataPartition(property_data$price,times=1,p=0.8,list=F)
train_data = property_data[train_ensbl_Index,]
test = property_data[-train_ensbl_Index,]

### now split train_data into train0 and train1
splitIndex = createDataPartition(train_data$price,times=1,p=0.75,list=F)
train0 = train_data[splitIndex,]
train1 = train_data[-splitIndex,]

#Split train0 into 5 folds
KFOLD <- 5
folds = createFolds(train0$price,k=KFOLD,list=TRUE)
from_report = property_data
save(from_report,file='property_data_from_Rmd_file.Robject')

Reconstruction

na_prop = sapply(property_data,function(x){mean(is.na(x))})
na_prop_df = data.frame('feature'=names(na_prop),'prop_na'=na_prop)
na_prop_df = na_prop_df[na_prop_df$prop_na>0,]
na_prop_df = na_prop_df[order(na_prop_df$prop_na,decreasing = T),]
na_prop_df$prop_na = round(na_prop_df$prop_na,2)

#write a 
na_prop_df  %>% kable(row.names=F,
                      col.names = c('Feature','Proportion of Objects with NA Values'))

Feature Proportion of Objects with NA Values ——– ————————————-

note recon after validation

include some plots with the red coloring

dont harp on

Importing data from OSM

Primary Models

consider including a pic of error/sample size to justify not imputing but point out i explored it

Gradient Boosted Trees

The model fitting begins with gradient boosted descision trees, specifically the xgboost package, an R api for the popular software library.

Polar Coordinates

Looking at the map above, it appears

K-Nearest Neighbours

point out epanechnikov chosen ahead of time, said to be optimal in a rmse sense, (said wiki, sourced a link, dont have money to buy the paper)

**acknowledge limitations of CV in that i chose feats before CV, not to worry, it wont

Generalised Additive Model

Now an additive model is fitted using the gam package. This model was fitted last as it relies the most on an understanding of the data.

The model expression was constructed manually, the result of first exploration and then tuning the splines’ freedom. Simplicity was favoured over complexity to avoid overfitting as a consequence of lengthy experimentation with the data.

caret was not used this time, it has a habit of oversimplifying the fitting interface.

Ensembling

Discussion

Conclusion